import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
#!pip install category_encoders
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
#!pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from xgboost import plot_importance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
#!pip install pyECLAT
# Load the diabetic-retinopathy dataset from CSV into a DataFrame.
df = pd.read_csv('testing edited 5.2.csv')
# Inspect the dataset dimensions (rows, columns).
df.shape
(400, 26)
# List all column names in the dataset.
df.columns
Index(['Gender', 'Age', 'BMI', 'Duration in years', 'Sodium Level',
'Potassium Level', 'Createnine level', 'uric acid', 'HBA1C',
'Urea level', 'eGFR', 'protien level', 'ALBUMIN', 'ALT', 'AST',
'calcium', 'HDL', 'LDL', 'Maculopathy', 'Hypertension', 'Dyslipidemia',
'Cardiovascular Disease', 'Diabetic Neuropathy',
'Diabetic Macular Edema', 'Chronic Kidney Disease', 'RETINOPATHY'],
dtype='object')
# Preview the first 10 rows to get a feel for the data.
df.head(10)
| Gender | Age | BMI | Duration in years | Sodium Level | Potassium Level | Createnine level | uric acid | HBA1C | Urea level | ... | HDL | LDL | Maculopathy | Hypertension | Dyslipidemia | Cardiovascular Disease | Diabetic Neuropathy | Diabetic Macular Edema | Chronic Kidney Disease | RETINOPATHY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 66 | 27 | 18 | 138 | 4.51 | 79.0 | 249.0 | 11.2 | 4.64 | ... | 1.02 | 2.14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | no |
| 1 | 0 | 62 | 36 | 16 | 140 | 3.46 | 47.0 | 256.0 | 7.5 | 3.51 | ... | 1.23 | 0.85 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | no |
| 2 | 0 | 75 | 32 | 22 | 144 | 4.85 | 46.8 | 255.0 | 5.9 | 4.78 | ... | 1.50 | 2.26 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | no |
| 3 | 1 | 63 | 30 | 8 | 136 | 4.11 | 172.0 | 361.0 | 7.8 | 7.60 | ... | 1.07 | 2.11 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | no |
| 4 | 0 | 62 | 22 | 12 | 140 | 4.68 | 44.7 | 190.0 | 7.1 | 4.50 | ... | 1.40 | 4.09 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | no |
| 5 | 0 | 76 | 21 | 8 | 139 | 4.34 | 73.3 | 333.0 | 6.5 | 5.92 | ... | 1.48 | 2.86 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | no |
| 6 | 0 | 79 | 26 | 20 | 137 | 4.22 | 84.0 | 358.0 | 6.6 | 11.68 | ... | 0.94 | 0.92 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | no |
| 7 | 0 | 57 | 27 | 14 | 143 | 4.23 | 41.6 | 249.0 | 7.6 | 3.59 | ... | 1.43 | 1.99 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | yes |
| 8 | 0 | 60 | 26 | 16 | 139 | 4.51 | 51.0 | 246.0 | 7.4 | 2.68 | ... | 1.32 | 2.10 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | yes |
| 9 | 0 | 80 | 16 | 15 | 131 | 4.59 | 376.0 | 272.0 | 4.6 | 8.98 | ... | 1.42 | 3.29 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | no |
10 rows × 26 columns
# Check each column's dtype — note that BMI and RETINOPATHY are 'object' (strings).
df.dtypes
Gender int64 Age int64 BMI object Duration in years int64 Sodium Level int64 Potassium Level float64 Createnine level float64 uric acid float64 HBA1C float64 Urea level float64 eGFR float64 protien level float64 ALBUMIN float64 ALT float64 AST float64 calcium float64 HDL float64 LDL float64 Maculopathy int64 Hypertension int64 Dyslipidemia int64 Cardiovascular Disease int64 Diabetic Neuropathy int64 Diabetic Macular Edema int64 Chronic Kidney Disease int64 RETINOPATHY object dtype: object
# Column summary: non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 400 entries, 0 to 399 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 400 non-null int64 1 Age 400 non-null int64 2 BMI 400 non-null object 3 Duration in years 400 non-null int64 4 Sodium Level 400 non-null int64 5 Potassium Level 400 non-null float64 6 Createnine level 400 non-null float64 7 uric acid 400 non-null float64 8 HBA1C 400 non-null float64 9 Urea level 400 non-null float64 10 eGFR 400 non-null float64 11 protien level 400 non-null float64 12 ALBUMIN 400 non-null float64 13 ALT 400 non-null float64 14 AST 400 non-null float64 15 calcium 400 non-null float64 16 HDL 400 non-null float64 17 LDL 400 non-null float64 18 Maculopathy 400 non-null int64 19 Hypertension 400 non-null int64 20 Dyslipidemia 400 non-null int64 21 Cardiovascular Disease 400 non-null int64 22 Diabetic Neuropathy 400 non-null int64 23 Diabetic Macular Edema 400 non-null int64 24 Chronic Kidney Disease 400 non-null int64 25 RETINOPATHY 400 non-null object dtypes: float64(13), int64(11), object(2) memory usage: 81.4+ KB
# Descriptive statistics (count, mean, std, min/max, quartiles) for numeric columns.
df.describe()
| Gender | Age | Duration in years | Sodium Level | Potassium Level | Createnine level | uric acid | HBA1C | Urea level | eGFR | ... | calcium | HDL | LDL | Maculopathy | Hypertension | Dyslipidemia | Cardiovascular Disease | Diabetic Neuropathy | Diabetic Macular Edema | Chronic Kidney Disease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 400.000000 | 400.000000 | 400.000000 | 400.00000 | 400.000000 | 400.00000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | ... | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.00000 | 400.000000 |
| mean | 0.367500 | 64.440000 | 15.715000 | 139.13250 | 4.374850 | 115.28770 | 286.192450 | 8.017250 | 7.135450 | 76.653000 | ... | 2.225050 | 1.374325 | 2.209800 | 0.440000 | 0.807500 | 0.557500 | 0.030000 | 0.085000 | 0.23500 | 0.280000 |
| std | 0.482728 | 11.500306 | 7.870109 | 3.71948 | 0.572578 | 153.70544 | 100.706813 | 1.879564 | 5.209383 | 28.149076 | ... | 0.201349 | 0.465009 | 1.021514 | 0.497009 | 0.394757 | 0.497305 | 0.170801 | 0.279231 | 0.42453 | 0.449561 |
| min | 0.000000 | 8.000000 | 3.000000 | 122.00000 | 2.670000 | 5.30000 | 2.300000 | 4.400000 | 1.700000 | 3.400000 | ... | 1.010000 | 0.050000 | 0.400000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 25% | 0.000000 | 58.000000 | 9.750000 | 137.00000 | 4.000000 | 56.97500 | 221.000000 | 6.700000 | 4.187500 | 61.000000 | ... | 2.160000 | 1.080000 | 1.500000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 50% | 0.000000 | 66.000000 | 15.000000 | 140.00000 | 4.345000 | 74.00000 | 276.500000 | 7.500000 | 5.700000 | 82.000000 | ... | 2.240000 | 1.300000 | 2.010000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 |
| 75% | 1.000000 | 73.000000 | 20.000000 | 141.00000 | 4.700000 | 101.77500 | 348.250000 | 9.025000 | 8.000000 | 96.200000 | ... | 2.310000 | 1.602500 | 2.700000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 |
| max | 1.000000 | 95.000000 | 39.000000 | 149.00000 | 6.590000 | 1286.00000 | 697.300000 | 18.200000 | 43.000000 | 129.600000 | ... | 4.200000 | 3.200000 | 6.300000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 |
8 rows × 24 columns
# Check whether any column contains missing values.
df.isnull().any()
Gender False Age False BMI False Duration in years False Sodium Level False Potassium Level False Createnine level False uric acid False HBA1C False Urea level False eGFR False protien level False ALBUMIN False ALT False AST False calcium False HDL False LDL False Maculopathy False Hypertension False Dyslipidemia False Cardiovascular Disease False Diabetic Neuropathy False Diabetic Macular Edema False Chronic Kidney Disease False RETINOPATHY False dtype: bool
# Count missing values per column (all zeros per the output above this cell).
df.isnull().sum()
Gender 0 Age 0 BMI 0 Duration in years 0 Sodium Level 0 Potassium Level 0 Createnine level 0 uric acid 0 HBA1C 0 Urea level 0 eGFR 0 protien level 0 ALBUMIN 0 ALT 0 AST 0 calcium 0 HDL 0 LDL 0 Maculopathy 0 Hypertension 0 Dyslipidemia 0 Cardiovascular Disease 0 Diabetic Neuropathy 0 Diabetic Macular Edema 0 Chronic Kidney Disease 0 RETINOPATHY 0 dtype: int64
It can be seen that there are no missing values in the dataset.
# Report how many fully duplicated rows exist in the dataset.
dup_rows = df.duplicated().sum()
print(f'we have : {dup_rows} duplicated values ')
we have : 9 duplicated values
We find 9 duplicate rows in the dataset:
# Inspect the distinct BMI values — stored as strings and containing a '?' placeholder.
df['BMI'].unique()
array(['27', '36', '32', '30', '22', '21', '26', '16', '29', '33', '39',
'34', '40', '35', '28', '51', '23', '24', '41', '31', '37', '25',
'17', '30.8', '19', '14', '52', '47', '45', '38', '46', '18', '44',
'49', '20', '42', '50', '43', '?'], dtype=object)
# BMI is stored as strings and uses '?' as a missing-value marker (see the
# unique() output above). Impute '?' with the most frequent valid BMI.
# Fixes two defects in the original:
#  * str.replace does SUBSTRING replacement and — where regex=True is the
#    default — treats '?' as a regex metacharacter ("nothing to repeat" error).
#    Series.replace matches the whole cell value exactly.
#  * mode() was computed on the column while it still contained '?', so the
#    placeholder itself could in principle be chosen as the fill value.
mode_bmi = df.loc[df['BMI'] != '?', 'BMI'].mode()[0]
df['BMI'] = df['BMI'].replace('?', mode_bmi)
df['BMI'] = df['BMI'].astype(float)
# Visualise the distribution of every numeric column with a histogram
# plus a kernel-density overlay.
numerical_variables = df.select_dtypes(include=[np.number]).columns.tolist()
for column in numerical_variables:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=30, kde=True, color='purple')
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
# Frequency of each RETINOPATHY class label.
ValueCounts_RETINOPATHY = df['RETINOPATHY'].value_counts()
# Display the class counts (206 'no' vs 194 'yes' — the classes are roughly balanced).
ValueCounts_RETINOPATHY
RETINOPATHY no 206 yes 194 Name: count, dtype: int64
# Bar chart of the RETINOPATHY class counts computed above.
ax = ValueCounts_RETINOPATHY.plot(
    kind='bar',
    figsize=(10, 6),
    width=0.40,
    color=['green', 'brown'],
    fontsize=15,
    title='A Bar graph showing values of RETINOPATHY column',
)
ax.set_xlabel("RETINOPATHY", fontsize=15)
ax.set_ylabel("Count", fontsize=15)
plt.show()
# Correlation matrix to see how the variables are correlated with each other
# and with the target.
# Keep a copy with the original string labels BEFORE encoding the target in place.
df1 = df.copy()
# Encode RETINOPATHY as integer category codes ('no' -> 0, 'yes' -> 1 by
# lexicographic category order) so it can participate in the numeric correlation.
df['RETINOPATHY'] = df['RETINOPATHY'].astype('category').cat.codes
correlation_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
# Features: every column except the target.
X = df.drop(columns='RETINOPATHY')
# Response / target variable (already encoded to 0/1 above).
y = df['RETINOPATHY']
# Two-stage pipeline: target-encode categorical columns, then fit XGBoost.
estimators = [
    ('encoder', TargetEncoder()),
    ('clf', XGBClassifier(random_state=42)),
]
XGB_pipe = Pipeline(steps=estimators)
XGB_pipe
Pipeline(steps=[('encoder', TargetEncoder()),
('clf',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None, gpu_id=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...))])
# Hyper-parameter search space for the XGBoost step of the pipeline.
search_space_XGB = {
    'clf__max_depth': Integer(2, 8),                              # tree depth
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),  # step size (eta)
    'clf__subsample': Real(0.5, 1.0),                             # row sampling
    'clf__colsample_bytree': Real(0.5, 1.0),                      # column sampling
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),                            # L1 regularisation
    'clf__reg_lambda': Real(0.0, 10.0),                           # L2 regularisation
    'clf__gamma': Real(0.0, 10.0),                                # min split loss
}
# Bayesian hyper-parameter optimisation with 10-fold CV, scored by ROC AUC.
BayesSearch_model = BayesSearchCV(
    XGB_pipe,
    search_space_XGB,
    cv=10,
    n_iter=10,
    scoring='roc_auc',
    random_state=42,
)
BayesSearch_model.fit(X, y)
Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. 
Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. 
Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. 
Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. 
Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data. Warning: No categorical columns found. Calling 'transform' will only return input data.
BayesSearchCV(cv=10,
estimator=Pipeline(steps=[('encoder', TargetEncoder()),
('clf',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=...
'clf__learning_rate': Real(low=0.001, high=1.0, prior='log-uniform', transform='normalize'),
'clf__max_depth': Integer(low=2, high=8, prior='uniform', transform='normalize'),
'clf__reg_alpha': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
'clf__reg_lambda': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
'clf__subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')})
# Show the pipeline refit on the full data with the best hyper-parameters found.
BayesSearch_model.best_estimator_
Pipeline(steps=[('encoder', TargetEncoder(cols=[])),
('clf',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=0.5018151536273716,
colsample_bynode=0.9089150098318758,
colsample_bytree=0.8706413721840136,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=2.545542578640449,
gpu_id=None, g...cy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.054449388745626506, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=42, ...))])
# ROC AUC (the configured scorer) evaluated on the SAME data the model was
# fitted on — an optimistic estimate, since no hold-out split was made.
BayesSearch_model.score(X, y)
0.9709738764888399
# Predicted class labels on the training data (0 = 'no', 1 = 'yes',
# matching the category codes assigned earlier).
y_pred_xgb = BayesSearch_model.predict(X)
y_pred_xgb
array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1])
# Predicted class probabilities per row: columns are [P(class 0), P(class 1)].
BayesSearch_model.predict_proba(X)
array([[0.9159406 , 0.08405942],
[0.9347741 , 0.06522588],
[0.90762603, 0.09237395],
[0.7210989 , 0.2789011 ],
[0.6661266 , 0.33387342],
[0.85130954, 0.14869048],
[0.76077884, 0.23922117],
[0.32195836, 0.67804164],
[0.0719052 , 0.9280948 ],
[0.7496879 , 0.25031212],
[0.89355147, 0.10644856],
[0.8176985 , 0.1823015 ],
[0.43081236, 0.56918764],
[0.9074761 , 0.09252388],
[0.23572987, 0.7642701 ],
[0.84991086, 0.15008917],
[0.60196483, 0.39803514],
[0.62962604, 0.370374 ],
[0.82853097, 0.17146905],
[0.91860735, 0.08139264],
[0.654192 , 0.345808 ],
[0.89926857, 0.10073143],
[0.91553867, 0.08446135],
[0.83361393, 0.16638605],
[0.9109591 , 0.08904088],
[0.05182678, 0.9481732 ],
[0.05014151, 0.9498585 ],
[0.85427827, 0.14572173],
[0.8636854 , 0.13631457],
[0.8472857 , 0.1527143 ],
[0.90959454, 0.09040543],
[0.40716934, 0.59283066],
[0.9048841 , 0.09511591],
[0.27935684, 0.72064316],
[0.28994614, 0.71005386],
[0.64354515, 0.35645488],
[0.79672915, 0.20327087],
[0.7396163 , 0.2603837 ],
[0.9185575 , 0.08144245],
[0.76517487, 0.23482515],
[0.05559379, 0.9444062 ],
[0.9348922 , 0.06510784],
[0.79616475, 0.20383526],
[0.9014248 , 0.0985752 ],
[0.90004486, 0.09995514],
[0.8653755 , 0.13462451],
[0.8776513 , 0.12234872],
[0.7327284 , 0.26727158],
[0.5067649 , 0.49323514],
[0.9014248 , 0.0985752 ],
[0.34517664, 0.65482336],
[0.09841406, 0.90158594],
[0.92910504, 0.07089499],
[0.79974407, 0.20025595],
[0.92020506, 0.07979495],
[0.8254826 , 0.17451736],
[0.05302805, 0.94697195],
[0.28618455, 0.71381545],
[0.9290936 , 0.07090642],
[0.89164215, 0.10835785],
[0.799416 , 0.20058398],
[0.23422283, 0.7657772 ],
[0.1910178 , 0.8089822 ],
[0.89674854, 0.10325145],
[0.7858851 , 0.2141149 ],
[0.20883596, 0.79116404],
[0.8578961 , 0.1421039 ],
[0.9328778 , 0.06712223],
[0.33403474, 0.66596526],
[0.7813122 , 0.2186878 ],
[0.90959454, 0.09040543],
[0.8897943 , 0.11020571],
[0.05133629, 0.9486637 ],
[0.85078216, 0.14921784],
[0.9139303 , 0.08606971],
[0.9290887 , 0.0709113 ],
[0.82948446, 0.17051552],
[0.8409793 , 0.15902072],
[0.05147833, 0.9485217 ],
[0.1506635 , 0.8493365 ],
[0.9068685 , 0.0931315 ],
[0.63629353, 0.36370644],
[0.91860735, 0.08139264],
[0.88384515, 0.11615487],
[0.17769027, 0.82230973],
[0.08321482, 0.9167852 ],
[0.24856222, 0.7514378 ],
[0.24352092, 0.7564791 ],
[0.05066121, 0.9493388 ],
[0.15689969, 0.8431003 ],
[0.21649998, 0.7835 ],
[0.75181174, 0.24818826],
[0.41760963, 0.58239037],
[0.2723127 , 0.7276873 ],
[0.92867243, 0.0713276 ],
[0.9290936 , 0.07090642],
[0.82387817, 0.17612182],
[0.7692871 , 0.23071289],
[0.889357 , 0.110643 ],
[0.57010436, 0.4298956 ],
[0.07080883, 0.9291912 ],
[0.8815224 , 0.1184776 ],
[0.7299721 , 0.27002788],
[0.12595779, 0.8740422 ],
[0.05014151, 0.9498585 ],
[0.87514096, 0.12485906],
[0.06438941, 0.9356106 ],
[0.73258245, 0.26741758],
[0.8276973 , 0.17230274],
[0.15602791, 0.8439721 ],
[0.79193413, 0.20806587],
[0.26170552, 0.7382945 ],
[0.05016083, 0.9498392 ],
[0.8839821 , 0.11601787],
[0.05080539, 0.9491946 ],
[0.77196634, 0.22803365],
[0.17029595, 0.82970405],
[0.16115755, 0.83884245],
[0.59573984, 0.40426013],
[0.9060386 , 0.09396144],
[0.1588366 , 0.8411634 ],
[0.24245691, 0.7575431 ],
[0.91153264, 0.08846736],
[0.8574309 , 0.14256912],
[0.93449986, 0.06550013],
[0.9068778 , 0.09312219],
[0.16961414, 0.83038586],
[0.06611097, 0.93388903],
[0.8226313 , 0.1773687 ],
[0.26932853, 0.73067147],
[0.7254001 , 0.2745999 ],
[0.22129309, 0.7787069 ],
[0.19461 , 0.80539 ],
[0.05428356, 0.94571644],
[0.06933236, 0.93066764],
[0.9112455 , 0.0887545 ],
[0.7455321 , 0.2544679 ],
[0.0710606 , 0.9289394 ],
[0.23598462, 0.7640154 ],
[0.9088317 , 0.09116831],
[0.86800444, 0.13199553],
[0.90133274, 0.09866725],
[0.1583904 , 0.8416096 ],
[0.7296417 , 0.2703583 ],
[0.05014151, 0.9498585 ],
[0.06366545, 0.93633455],
[0.15186268, 0.8481373 ],
[0.29219663, 0.70780337],
[0.39206904, 0.60793096],
[0.93024874, 0.06975124],
[0.05405951, 0.9459405 ],
[0.23084444, 0.76915556],
[0.7709879 , 0.22901207],
[0.7691813 , 0.23081867],
[0.0704447 , 0.9295553 ],
[0.8937997 , 0.10620029],
[0.20741552, 0.7925845 ],
[0.1506635 , 0.8493365 ],
[0.23640728, 0.7635927 ],
[0.88737947, 0.11262053],
[0.5729968 , 0.4270032 ],
[0.7975131 , 0.2024869 ],
[0.8582889 , 0.14171112],
[0.81704086, 0.18295915],
[0.9279438 , 0.0720562 ],
[0.8901877 , 0.10981231],
[0.4103334 , 0.5896666 ],
[0.8827275 , 0.11727251],
[0.7945082 , 0.20549181],
[0.29684222, 0.7031578 ],
[0.8205995 , 0.17940052],
[0.8673048 , 0.13269518],
[0.05215913, 0.94784087],
[0.8807451 , 0.11925489],
[0.19487637, 0.8051236 ],
[0.05167973, 0.94832027],
[0.8922309 , 0.10776908],
[0.83983535, 0.16016464],
[0.1584177 , 0.8415823 ],
[0.82005364, 0.17994636],
[0.9373062 , 0.06269377],
[0.9384989 , 0.06150111],
[0.05145848, 0.9485415 ],
[0.75226545, 0.24773452],
[0.8041566 , 0.19584338],
[0.8464549 , 0.15354508],
[0.90228707, 0.09771292],
[0.9014248 , 0.0985752 ],
[0.05464977, 0.9453502 ],
[0.9347741 , 0.06522588],
[0.05267966, 0.94732034],
[0.8158641 , 0.18413594],
[0.25624412, 0.7437559 ],
[0.88742065, 0.11257932],
[0.31868452, 0.6813155 ],
[0.94067043, 0.05932958],
[0.7637472 , 0.23625281],
[0.87514096, 0.12485906],
[0.9271794 , 0.07282058],
[0.05195409, 0.9480459 ],
[0.87514096, 0.12485906],
[0.8602711 , 0.13972893],
[0.8385648 , 0.16143519],
[0.47016233, 0.52983767],
[0.28980905, 0.71019095],
[0.8802105 , 0.11978945],
[0.7079587 , 0.29204127],
[0.81930816, 0.18069185],
[0.37690485, 0.62309515],
[0.8856661 , 0.11433395],
[0.898845 , 0.10115501],
[0.21823072, 0.7817693 ],
[0.90693164, 0.09306833],
[0.86892974, 0.13107023],
[0.15958703, 0.840413 ],
[0.852164 , 0.14783601],
[0.8846628 , 0.11533719],
[0.22258055, 0.77741945],
[0.3657711 , 0.6342289 ],
[0.91945225, 0.08054775],
[0.17980057, 0.82019943],
[0.9030054 , 0.09699456],
[0.8956448 , 0.10435523],
[0.15212119, 0.8478788 ],
[0.8252737 , 0.17472629],
[0.16396964, 0.83603036],
[0.8326255 , 0.16737448],
[0.21130359, 0.7886964 ],
[0.5993699 , 0.40063015],
[0.7835018 , 0.21649818],
[0.9290936 , 0.07090642],
[0.16226268, 0.8377373 ],
[0.3579675 , 0.6420325 ],
[0.05257756, 0.94742244],
[0.90945137, 0.09054866],
[0.5586313 , 0.44136873],
[0.1221782 , 0.8778218 ],
[0.7455084 , 0.25449163],
[0.8291218 , 0.1708782 ],
[0.8291218 , 0.1708782 ],
[0.49493462, 0.5050654 ],
[0.21715754, 0.78284246],
[0.9107016 , 0.08929841],
[0.6559675 , 0.3440325 ],
[0.16038609, 0.8396139 ],
[0.91200125, 0.08799873],
[0.61614645, 0.38385355],
[0.90812516, 0.09187485],
[0.6001161 , 0.39988393],
[0.5751736 , 0.42482638],
[0.8715712 , 0.12842885],
[0.9371074 , 0.06289263],
[0.8878125 , 0.11218753],
[0.8009299 , 0.19907011],
[0.9076819 , 0.09231814],
[0.9290936 , 0.07090642],
[0.89096355, 0.10903645],
[0.85356677, 0.14643323],
[0.84184605, 0.15815395],
[0.8451279 , 0.1548721 ],
[0.518322 , 0.48167804],
[0.47100604, 0.52899396],
[0.13435495, 0.86564505],
[0.0570454 , 0.9429546 ],
[0.0586462 , 0.9413538 ],
[0.05267966, 0.94732034],
[0.05072117, 0.94927883],
[0.57444966, 0.4255503 ],
[0.06259328, 0.9374067 ],
[0.8578149 , 0.14218512],
[0.80115074, 0.19884926],
[0.89278907, 0.10721091],
[0.13105583, 0.86894417],
[0.9271794 , 0.07282058],
[0.1502406 , 0.8497594 ],
[0.7419231 , 0.25807694],
[0.3130831 , 0.6869169 ],
[0.9373062 , 0.06269377],
[0.5794816 , 0.42051837],
[0.19088453, 0.80911547],
[0.37972552, 0.6202745 ],
[0.55993795, 0.44006202],
[0.24245691, 0.7575431 ],
[0.90959454, 0.09040543],
[0.05464143, 0.9453586 ],
[0.86892974, 0.13107023],
[0.80613214, 0.19386785],
[0.5425221 , 0.45747793],
[0.79438645, 0.20561354],
[0.9251341 , 0.07486589],
[0.8793853 , 0.12061471],
[0.61082053, 0.38917947],
[0.90929294, 0.09070705],
[0.34005004, 0.65994996],
[0.8366307 , 0.1633693 ],
[0.8835045 , 0.11649548],
[0.9135427 , 0.08645729],
[0.07861257, 0.92138743],
[0.05669612, 0.9433039 ],
[0.37170684, 0.62829316],
[0.06595111, 0.9340489 ],
[0.9014248 , 0.0985752 ],
[0.7674445 , 0.2325555 ],
[0.5398712 , 0.46012878],
[0.8946368 , 0.1053632 ],
[0.6668243 , 0.33317572],
[0.92403066, 0.07596937],
[0.87687397, 0.12312602],
[0.6055937 , 0.39440635],
[0.8929237 , 0.10707629],
[0.28222823, 0.71777177],
[0.20779836, 0.79220164],
[0.3082357 , 0.6917643 ],
[0.235111 , 0.764889 ],
[0.87514096, 0.12485906],
[0.49959022, 0.5004098 ],
[0.33759373, 0.66240627],
[0.89278907, 0.10721091],
[0.22654301, 0.773457 ],
[0.8934023 , 0.10659771],
[0.30257535, 0.69742465],
[0.1506635 , 0.8493365 ],
[0.9093935 , 0.09060652],
[0.86356986, 0.13643013],
[0.86379385, 0.13620618],
[0.7551364 , 0.24486361],
[0.5288719 , 0.47112808],
[0.9024638 , 0.0975362 ],
[0.9373062 , 0.06269377],
[0.34499276, 0.65500724],
[0.8543612 , 0.14563882],
[0.8202817 , 0.17971832],
[0.6000211 , 0.39997888],
[0.23394 , 0.76606 ],
[0.26658386, 0.73341614],
[0.16834724, 0.83165276],
[0.23725396, 0.76274604],
[0.16834724, 0.83165276],
[0.9219912 , 0.07800874],
[0.23725396, 0.76274604],
[0.05518788, 0.9448121 ],
[0.5737823 , 0.4262177 ],
[0.26408565, 0.73591435],
[0.25023752, 0.7497625 ],
[0.29056966, 0.70943034],
[0.25150794, 0.74849206],
[0.20714223, 0.79285777],
[0.3175525 , 0.6824475 ],
[0.06689066, 0.93310934],
[0.05312383, 0.94687617],
[0.05066121, 0.9493388 ],
[0.0691191 , 0.9308809 ],
[0.05405951, 0.9459405 ],
[0.05213898, 0.947861 ],
[0.07964021, 0.9203598 ],
[0.06867141, 0.9313286 ],
[0.05422586, 0.94577414],
[0.05147791, 0.9485221 ],
[0.06841481, 0.9315852 ],
[0.05467391, 0.9453261 ],
[0.21714294, 0.78285706],
[0.05080539, 0.9491946 ],
[0.05431563, 0.9456844 ],
[0.05147791, 0.9485221 ],
[0.05580753, 0.94419247],
[0.06867141, 0.9313286 ],
[0.06867141, 0.9313286 ],
[0.12765038, 0.8723496 ],
[0.05602992, 0.9439701 ],
[0.06890565, 0.93109435],
[0.05066121, 0.9493388 ],
[0.06884509, 0.9311549 ],
[0.1221782 , 0.8778218 ],
[0.1451546 , 0.8548454 ],
[0.05702823, 0.94297177],
[0.07088178, 0.9291182 ],
[0.09625357, 0.9037464 ],
[0.14083982, 0.8591602 ],
[0.14083982, 0.8591602 ],
[0.17035562, 0.8296444 ],
[0.1414442 , 0.8585558 ],
[0.07248467, 0.9275153 ],
[0.17376035, 0.82623965],
[0.15597749, 0.8440225 ],
[0.13108039, 0.8689196 ],
[0.12963802, 0.870362 ],
[0.15184557, 0.8481544 ],
[0.8202817 , 0.17971832],
[0.16834724, 0.83165276],
[0.23725396, 0.76274604],
[0.16834724, 0.83165276],
[0.05312383, 0.94687617],
[0.13422734, 0.86577266],
[0.05213898, 0.947861 ],
[0.05080539, 0.9491946 ],
[0.06933236, 0.93066764],
[0.05014151, 0.9498585 ],
[0.06366545, 0.93633455],
[0.20883596, 0.79116404],
[0.05312383, 0.94687617]], dtype=float32)
# Inspect the individual (name, transformer/estimator) steps of the best pipeline.
BayesSearch_model.best_estimator_.steps
[('encoder', TargetEncoder(cols=[])),
('clf',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=0.5018151536273716,
colsample_bynode=0.9089150098318758,
colsample_bytree=0.8706413721840136, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=2.545542578640449, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.054449388745626506, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...))]
from xgboost import plot_importance

# Pull the fitted XGBClassifier out of the best pipeline found by the Bayesian
# search: steps[1] is the ('clf', XGBClassifier(...)) pair.
xgboost_step = BayesSearch_model.best_estimator_.steps[1]
_step_name, xgboost_model = xgboost_step
# Plot F-score based feature importances of the tuned model.
plot_importance(xgboost_model)
<Axes: title={'center': 'Feature importance'}, xlabel='F score', ylabel='Features'>
df.columns
Index(['Gender', 'Age', 'BMI', 'Duration in years', 'Sodium Level',
'Potassium Level', 'Createnine level', 'uric acid', 'HBA1C',
'Urea level', 'eGFR', 'protien level', 'ALBUMIN', 'ALT', 'AST',
'calcium', 'HDL', 'LDL', 'Maculopathy', 'Hypertension', 'Dyslipidemia',
'Cardiovascular Disease', 'Diabetic Neuropathy',
'Diabetic Macular Edema', 'Chronic Kidney Disease', 'RETINOPATHY'],
dtype='object')
It can be observed that Diabetic Macular Edema, Maculopathy, Duration in years, Hypertension and HBA1C are the top five features according to the XGBoost classifier.
# Evaluate the tuned XGBoost classifier against the true labels.
# Fix: accuracy_score's signature is (y_true, y_pred); the original call passed
# the arguments reversed. Accuracy is symmetric so the printed value is
# unchanged, but the conventional order prevents a silent bug if this metric is
# ever swapped for an asymmetric one (precision, recall, ...).
accu_xgb = accuracy_score(y, y_pred_xgb)
# Print the prediction accuracy
print('Test accuracy XGboost Classifier :', accu_xgb)
Test accuracy XGboost Classifier : 0.885
# Label map used only to title the heatmap axis ticks (No/Yes).
encoding_data = {'No': 0, 'Yes': 1}
cm_xgb = confusion_matrix(y, y_pred_xgb)

# Render the 2x2 confusion matrix as an annotated heatmap.
plt.figure(figsize=(7, 7))
sns.heatmap(cm_xgb, annot=True, vmin=0, fmt='g', cbar=False, cmap='viridis')
tick_positions = np.arange(2) + 0.5  # center labels on the cells
plt.xticks(tick_positions, encoding_data.keys())
plt.yticks(tick_positions, encoding_data.keys())
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix XGB Model")
plt.show()
cm_xgb
array([[187, 19],
[ 27, 167]], dtype=int64)
# Per-class precision, recall, F1-score and support for the tuned XGBoost model.
clr_xgb = classification_report(y, y_pred_xgb, target_names=encoding_data.keys())
# Print the classification report.
# Fix: the header previously read "XGBoos classifier" (typo).
print("Classification Report XGBoost classifier:\n----------------------\n", clr_xgb)
Classification Report XGBoos classifier:
----------------------
precision recall f1-score support
No 0.87 0.91 0.89 206
Yes 0.90 0.86 0.88 194
accuracy 0.89 400
macro avg 0.89 0.88 0.88 400
weighted avg 0.89 0.89 0.88 400
Data Preprocessing:
Model Building:
Model Evaluation:
Results:
# Rank predictors by absolute Pearson correlation with the target column.
correlation_matrix = df.corr()['RETINOPATHY'].abs().sort_values(ascending=False)
feature_ranking = correlation_matrix.drop('RETINOPATHY')  # drop the target's self-correlation
print("Feature Weights using Top 10 Correlation Based Features:")
print(feature_ranking.head(10))
# Keep the ten most correlated predictors.
top_10_features = feature_ranking.index[:10]
corr_based_10_feat_df = df[top_10_features]
# Show the reduced feature frame.
print("Selected Features:")
corr_based_10_feat_df.head()
Feature Weights using Top 10 Correlation Based Features: Maculopathy 0.641321 Diabetic Macular Edema 0.571131 Hypertension 0.372317 Chronic Kidney Disease 0.297239 Duration in years 0.291018 Urea level 0.253256 eGFR 0.248313 Createnine level 0.238512 HBA1C 0.228776 Gender 0.204448 Name: RETINOPATHY, dtype: float64 Selected Features:
| Maculopathy | Diabetic Macular Edema | Hypertension | Chronic Kidney Disease | Duration in years | Urea level | eGFR | Createnine level | HBA1C | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 18 | 4.64 | 67.0 | 79.0 | 11.2 | 0 |
| 1 | 0 | 0 | 0 | 0 | 16 | 3.51 | 100.6 | 47.0 | 7.5 | 0 |
| 2 | 0 | 0 | 1 | 0 | 22 | 4.78 | 93.9 | 46.8 | 5.9 | 0 |
| 3 | 0 | 0 | 1 | 1 | 8 | 7.60 | 35.0 | 172.0 | 7.8 | 1 |
| 4 | 1 | 0 | 0 | 0 | 12 | 4.50 | 104.4 | 44.7 | 7.1 | 0 |
# Score the correlation-selected top-10 feature set with an untuned XGBoost
# model under 10-fold cross-validation (shuffled, fixed seed for reproducibility).
model_corr_10 = XGBClassifier(random_state=42)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

accuracy_scores_corr_10 = cross_val_score(
    model_corr_10, corr_based_10_feat_df, df['RETINOPATHY'],
    cv=kfold, scoring='accuracy',
)
f1_scores_corr_10 = cross_val_score(
    model_corr_10, corr_based_10_feat_df, df['RETINOPATHY'],
    cv=kfold, scoring='f1_macro',
)

# Report the fold-averaged metrics.
print("Mean Accuracy with correlation based top 10 feats:", accuracy_scores_corr_10.mean())
print("Mean F1 Score with correlation based top 10 feats:", f1_scores_corr_10.mean())
Mean Accuracy with correlation based top 10 feats: 0.8525 Mean F1 Score with correlation based top 10 feats: 0.8511349187341976
# Rank predictors by absolute Pearson correlation with the target column.
correlation_matrix = df.corr()['RETINOPATHY'].abs().sort_values(ascending=False)
ranked_predictors = correlation_matrix.drop('RETINOPATHY')  # exclude the target itself
print("Feature Weights using Top 5 Correlation Based Features:")
print(ranked_predictors.head(5))
# Keep the five most correlated predictors.
top_5_features = ranked_predictors.index[:5]
corr_based_5_feat_df = df[top_5_features]
# Show the reduced feature frame.
print("Selected Features:")
corr_based_5_feat_df.head()
Feature Weights using Top 5 Correlation Based Features: Maculopathy 0.641321 Diabetic Macular Edema 0.571131 Hypertension 0.372317 Chronic Kidney Disease 0.297239 Duration in years 0.291018 Name: RETINOPATHY, dtype: float64 Selected Features:
| Maculopathy | Diabetic Macular Edema | Hypertension | Chronic Kidney Disease | Duration in years | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 18 |
| 1 | 0 | 0 | 0 | 0 | 16 |
| 2 | 0 | 0 | 1 | 0 | 22 |
| 3 | 0 | 0 | 1 | 1 | 8 |
| 4 | 1 | 0 | 0 | 0 | 12 |
# Initialize the XGB Classifier for the top-5 correlation-based feature set.
model_corr_5 = XGBClassifier(random_state=42)
# Define the cross-validation strategy (k-fold, shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# Perform cross-validation on accuracy and macro-F1.
accuracy_scores_corr_5 = cross_val_score(model_corr_5, corr_based_5_feat_df, df['RETINOPATHY'], cv=kfold, scoring='accuracy')
# Fix: the F1 run previously reused model_corr_10 from an earlier cell. Both
# estimators carry identical hyperparameters so the scores happen to match,
# but the top-5 evaluation must use the top-5 model to stay self-contained.
f1_scores_corr_5 = cross_val_score(model_corr_5, corr_based_5_feat_df, df['RETINOPATHY'], cv=kfold, scoring='f1_macro')
# Print the mean accuracy and F1 scores
print("Mean Accuracy with correlation based top 5 feats:", accuracy_scores_corr_5.mean())
print("Mean F1 Score with correlation based top 5 feats:", f1_scores_corr_5.mean())
Mean Accuracy with correlation based top 5 feats: 0.8775000000000001 Mean F1 Score with correlation based top 5 feats: 0.8759051215129505
# Univariate filter selection: keep the k features with the highest ANOVA
# F-score w.r.t. the target. NOTE(review): f_classif computes the ANOVA
# F-statistic, not information gain, despite the printed label below.
k = 10
Select_10_Best = SelectKBest(score_func=f_classif, k=k)

# Fit on all predictors and reduce to the selected columns.
predictor_frame = df.drop(columns=['RETINOPATHY'])
X_selected_10 = Select_10_Best.fit_transform(predictor_frame, df['RETINOPATHY'])

# Recover column names for the retained features.
selected_indices = Select_10_Best.get_support(indices=True)
selected_features_10 = predictor_frame.columns[selected_indices]
selected_features_df_10 = pd.DataFrame(X_selected_10, columns=selected_features_10)

# Pair each kept feature with its score and sort descending.
selected_scores_10 = Select_10_Best.scores_[selected_indices]
selected_features_Scores_df_10 = (
    pd.DataFrame({'Feature': selected_features_10, 'Score': selected_scores_10})
    .sort_values(by='Score', ascending=False)
)

print("Feature Weights(scores) using Top 10 information gain Based Features:")
print(selected_features_Scores_df_10)
print("Selected top 10 best Features:")
selected_features_df_10.head()
Feature Weights(scores) using Top 10 information gain Based Features:
Feature Score
6 Maculopathy 278.057184
8 Diabetic Macular Edema 192.671800
7 Hypertension 64.049290
9 Chronic Kidney Disease 38.571600
1 Duration in years 36.826090
4 Urea level 27.276615
5 eGFR 26.153104
2 Createnine level 24.007025
3 HBA1C 21.981123
0 Gender 17.361711
Selected top 10 best Features:
| Gender | Duration in years | Createnine level | HBA1C | Urea level | eGFR | Maculopathy | Hypertension | Diabetic Macular Edema | Chronic Kidney Disease | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 18.0 | 79.0 | 11.2 | 4.64 | 67.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 16.0 | 47.0 | 7.5 | 3.51 | 100.6 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 22.0 | 46.8 | 5.9 | 4.78 | 93.9 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 1.0 | 8.0 | 172.0 | 7.8 | 7.60 | 35.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 4 | 0.0 | 12.0 | 44.7 | 7.1 | 4.50 | 104.4 | 1.0 | 0.0 | 0.0 | 0.0 |
# Initialize the XGB Classifier
model_Select_10 = XGBClassifier(random_state=42)
# Define the cross-validation strategy (k-fold)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# Perform cross-validation
accuracy_scores_select_10 = cross_val_score(model_Select_10, selected_features_df_10, df['RETINOPATHY'], cv=kfold, scoring='accuracy')
f1_scores_select_10 = cross_val_score(model_Select_10, selected_features_df_10, df['RETINOPATHY'], cv=kfold, scoring='f1_macro')
# Print the mean accuracy and F1 scores
print("Mean Accuracy with gain based top 10 feats:", accuracy_scores_select_10.mean())
print("Mean F1 Score with gain based top 10 feats:", f1_scores_select_10.mean())
Mean Accuracy with gain based top 10 feats: 0.8625 Mean F1 Score with gain based top 10 feats: 0.8613261471343492
# Univariate filter selection: keep the k features with the highest ANOVA
# F-score w.r.t. the target. NOTE(review): f_classif computes the ANOVA
# F-statistic, not information gain, despite the printed label below.
k = 5
Select_5_Best = SelectKBest(score_func=f_classif, k=k)

# Fit on all predictors and reduce to the selected columns.
predictor_frame = df.drop(columns=['RETINOPATHY'])
X_selected_5 = Select_5_Best.fit_transform(predictor_frame, df['RETINOPATHY'])

# Recover column names for the retained features.
selected_indices = Select_5_Best.get_support(indices=True)
selected_features_5 = predictor_frame.columns[selected_indices]
selected_features_df_5 = pd.DataFrame(X_selected_5, columns=selected_features_5)

# Pair each kept feature with its score and sort descending.
selected_scores_5 = Select_5_Best.scores_[selected_indices]
selected_features_Scores_df_5 = (
    pd.DataFrame({'Feature': selected_features_5, 'Score': selected_scores_5})
    .sort_values(by='Score', ascending=False)
)

print("Feature Weights(scores) using Top 5 information gain Based Features:")
print(selected_features_Scores_df_5)
print("Selected 5 best Features:")
selected_features_df_5.head()
Feature Weights(scores) using Top 5 information gain Based Features:
Feature Score
1 Maculopathy 278.057184
3 Diabetic Macular Edema 192.671800
2 Hypertension 64.049290
4 Chronic Kidney Disease 38.571600
0 Duration in years 36.826090
Selected 5 best Features:
| Duration in years | Maculopathy | Hypertension | Diabetic Macular Edema | Chronic Kidney Disease | |
|---|---|---|---|---|---|
| 0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 22.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 8.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 4 | 12.0 | 1.0 | 0.0 | 0.0 | 0.0 |
# Initialize the XGB Classifier
model_Select_5 = XGBClassifier(random_state=42)
# Define the cross-validation strategy (k-fold)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# Perform cross-validation
accuracy_scores_select_5 = cross_val_score(model_Select_5, selected_features_df_5, df['RETINOPATHY'], cv=kfold, scoring='accuracy')
f1_scores_select_5 = cross_val_score(model_Select_5, selected_features_df_5, df['RETINOPATHY'], cv=kfold, scoring='f1_macro')
# Print the mean accuracy and F1 scores
print("Mean Accuracy with gain based top 5 feats:", accuracy_scores_select_5.mean())
print("Mean F1 Score with gain based top 5 feats:", f1_scores_select_5.mean())
Mean Accuracy with gain based top 5 feats: 0.8775000000000001 Mean F1 Score with gain based top 5 feats: 0.8759051215129505
# Embedded (learner-based) selection: fit XGBoost on every predictor, then
# keep the ten most important features via SelectFromModel.
xgb_classifier_10 = XGBClassifier()
predictor_frame = df.drop(columns=['RETINOPATHY'])
xgb_classifier_10.fit(predictor_frame, df['RETINOPATHY'])

# threshold=-inf disables the importance cutoff so max_features alone decides.
selector_10 = SelectFromModel(xgb_classifier_10, prefit=True, threshold=-np.inf, max_features=10)
X_selected_10 = selector_10.transform(predictor_frame)

# Recover names of the retained columns.
selected_indices = selector_10.get_support(indices=True)
selected_features_top_10 = predictor_frame.columns[selected_indices]
selected_features_df_10 = pd.DataFrame(X_selected_10, columns=selected_features_top_10)

# Pair each kept feature with its importance and sort descending.
feature_importances = xgb_classifier_10.feature_importances_
selected_features_imp_10 = (
    pd.DataFrame({'Feature': selected_features_top_10,
                  'Feature Importance': feature_importances[selected_indices]})
    .sort_values(by='Feature Importance', ascending=False)
)

print("Feature Weights(importance) using Top 10 learner Based Features::")
print(selected_features_imp_10)
print("Top 10 Selected learner based Features:")
selected_features_df_10.head()
Feature Weights(importance) using Top 10 learner Based Features::
Feature Feature Importance
8 Diabetic Macular Edema 0.342641
6 Maculopathy 0.258450
9 Chronic Kidney Disease 0.049784
1 Duration in years 0.041653
7 Hypertension 0.037411
0 Age 0.025518
4 ALBUMIN 0.024334
5 LDL 0.021583
2 Sodium Level 0.019305
3 HBA1C 0.019156
Top 10 Selected learner based Features:
| Age | Duration in years | Sodium Level | HBA1C | ALBUMIN | LDL | Maculopathy | Hypertension | Diabetic Macular Edema | Chronic Kidney Disease | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 66.0 | 18.0 | 138.0 | 11.2 | 39.5 | 2.14 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 62.0 | 16.0 | 140.0 | 7.5 | 37.3 | 0.85 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 75.0 | 22.0 | 144.0 | 5.9 | 39.8 | 2.26 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 63.0 | 8.0 | 136.0 | 7.8 | 35.5 | 2.11 | 0.0 | 1.0 | 0.0 | 1.0 |
| 4 | 62.0 | 12.0 | 140.0 | 7.1 | 36.9 | 4.09 | 1.0 | 0.0 | 0.0 | 0.0 |
# Score the learner-selected top-10 feature set with an untuned XGBoost
# model under 10-fold cross-validation (shuffled, fixed seed).
model_learner_10 = XGBClassifier(random_state=42)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

accuracy_scores_learner_10 = cross_val_score(
    model_learner_10, selected_features_df_10, df['RETINOPATHY'],
    cv=kfold, scoring='accuracy',
)
f1_scores_learner_10 = cross_val_score(
    model_learner_10, selected_features_df_10, df['RETINOPATHY'],
    cv=kfold, scoring='f1_macro',
)

# Report the fold-averaged metrics.
print("Mean Accuracy with Learner based top 10 feats:", accuracy_scores_learner_10.mean())
print("Mean F1 Score with Learner based top 10 feats:", f1_scores_learner_10.mean())
Mean Accuracy with Learner based top 10 feats: 0.8925000000000001 Mean F1 Score with Learner based top 10 feats: 0.8912106971283578
# Embedded (learner-based) selection: fit XGBoost on every predictor, then
# keep the five most important features via SelectFromModel.
xgb_classifier_5 = XGBClassifier()
predictor_frame = df.drop(columns=['RETINOPATHY'])
xgb_classifier_5.fit(predictor_frame, df['RETINOPATHY'])

# threshold=-inf disables the importance cutoff so max_features alone decides.
selector_5 = SelectFromModel(xgb_classifier_5, prefit=True, threshold=-np.inf, max_features=5)
X_selected_5 = selector_5.transform(predictor_frame)

# Recover names of the retained columns.
selected_indices = selector_5.get_support(indices=True)
selected_features_top_5 = predictor_frame.columns[selected_indices]
selected_features_df_5 = pd.DataFrame(X_selected_5, columns=selected_features_top_5)

# Pair each kept feature with its importance and sort descending.
feature_importances = xgb_classifier_5.feature_importances_
selected_features_imp_5 = (
    pd.DataFrame({'Feature': selected_features_top_5,
                  'Feature Importance': feature_importances[selected_indices]})
    .sort_values(by='Feature Importance', ascending=False)
)

print("Feature Weights(importance) using Top 5 learner Based Features::")
print(selected_features_imp_5)
print("Top 5 Selected learner based Features:")
selected_features_df_5.head()
Feature Weights(importance) using Top 5 learner Based Features::
Feature Feature Importance
3 Diabetic Macular Edema 0.342641
1 Maculopathy 0.258450
4 Chronic Kidney Disease 0.049784
0 Duration in years 0.041653
2 Hypertension 0.037411
Top 5 Selected learner based Features:
| Duration in years | Maculopathy | Hypertension | Diabetic Macular Edema | Chronic Kidney Disease | |
|---|---|---|---|---|---|
| 0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 22.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 8.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 4 | 12.0 | 1.0 | 0.0 | 0.0 | 0.0 |
# Score the learner-selected top-5 feature set with an untuned XGBoost
# model under 10-fold cross-validation (shuffled, fixed seed).
model_learner_5 = XGBClassifier(random_state=42)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

accuracy_scores_learner_5 = cross_val_score(
    model_learner_5, selected_features_df_5, df['RETINOPATHY'],
    cv=kfold, scoring='accuracy',
)
f1_scores_learner_5 = cross_val_score(
    model_learner_5, selected_features_df_5, df['RETINOPATHY'],
    cv=kfold, scoring='f1_macro',
)

# Report the fold-averaged metrics.
print("Mean Accuracy with Learner based top 5 feats:", accuracy_scores_learner_5.mean())
print("Mean F1 Score with Learner based top 5 feats:", f1_scores_learner_5.mean())
Mean Accuracy with Learner based top 5 feats: 0.8775000000000001 Mean F1 Score with Learner based top 5 feats: 0.8759051215129505
This section compares feature selection methods using XGBoost with the top 10 and top 5 features, and contrasts the results with XGBoost trained on the full feature set. Here is a summary and interpretation of the outputs:
Comparison with Full Dataset:
Training XGBoost on the full dataset yielded a mean accuracy of approximately 85.00% and a mean F1 score of around 85.00%. Comparing these results with the feature selection methods, it's evident that selecting relevant features can lead to improved model performance in terms of accuracy and F1 score.
Interpretation:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

# Restrict to the binary (0/1) clinical-flag columns for association-rule mining.
binary_flag_cols = ['Gender', 'Maculopathy', 'Hypertension', 'Dyslipidemia',
                    'Cardiovascular Disease', 'Diabetic Neuropathy',
                    'Diabetic Macular Edema', 'Chronic Kidney Disease', 'RETINOPATHY']
df_binary = df.loc[:, binary_flag_cols]
df_binary.head()
| Gender | Maculopathy | Hypertension | Dyslipidemia | Cardiovascular Disease | Diabetic Neuropathy | Diabetic Macular Edema | Chronic Kidney Disease | RETINOPATHY | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
# fpgrowth expects boolean columns; True marks the condition as present.
df_binary_bool = df_binary.astype(bool)
# Mine itemsets appearing in at least 20% of rows; verbose=1 logs tree passes.
frequent_itemsets = fpgrowth(df_binary_bool, min_support=0.2, use_colnames=True, verbose=1)
7 itemset(s) from tree conditioned on items () 1 itemset(s) from tree conditioned on items (Dyslipidemia) 0 itemset(s) from tree conditioned on items (Hypertension) 2 itemset(s) from tree conditioned on items (Gender) 0 itemset(s) from tree conditioned on items (Gender, Hypertension) 1 itemset(s) from tree conditioned on items (Gender, RETINOPATHY) 3 itemset(s) from tree conditioned on items (Chronic Kidney Disease) 3 itemset(s) from tree conditioned on items (Maculopathy) 2 itemset(s) from tree conditioned on items (Maculopathy, Dyslipidemia) 0 itemset(s) from tree conditioned on items (Maculopathy, Dyslipidemia, RETINOPATHY) 1 itemset(s) from tree conditioned on items (Maculopathy, Dyslipidemia, Hypertension) 0 itemset(s) from tree conditioned on items (Maculopathy, Hypertension) 1 itemset(s) from tree conditioned on items (Maculopathy, RETINOPATHY) 2 itemset(s) from tree conditioned on items (RETINOPATHY) 0 itemset(s) from tree conditioned on items (RETINOPATHY, Hypertension) 1 itemset(s) from tree conditioned on items (RETINOPATHY, Dyslipidemia) 3 itemset(s) from tree conditioned on items (Diabetic Macular Edema)
# Show the mined itemsets with their support values.
print(frequent_itemsets)
support itemsets 0 0.5575 (Dyslipidemia) 1 0.8075 (Hypertension) 2 0.3675 (Gender) 3 0.2800 (Chronic Kidney Disease) 4 0.4400 (Maculopathy) 5 0.4850 (RETINOPATHY) 6 0.2350 (Diabetic Macular Edema) 7 0.4525 (Hypertension, Dyslipidemia) 8 0.3325 (Hypertension, Gender) 9 0.2275 (RETINOPATHY, Gender) 10 0.2225 (RETINOPATHY, Hypertension, Gender) 11 0.2750 (Chronic Kidney Disease, Hypertension) 12 0.2025 (Chronic Kidney Disease, RETINOPATHY) 13 0.2025 (Chronic Kidney Disease, RETINOPATHY, Hyperten... 14 0.2600 (Maculopathy, Dyslipidemia) 15 0.4100 (Hypertension, Maculopathy) 16 0.3725 (RETINOPATHY, Maculopathy) 17 0.2350 (RETINOPATHY, Maculopathy, Dyslipidemia) 18 0.2325 (Hypertension, Maculopathy, Dyslipidemia) 19 0.2225 (RETINOPATHY, Hypertension, Maculopathy, Dysli... 20 0.3600 (RETINOPATHY, Hypertension, Maculopathy) 21 0.4650 (RETINOPATHY, Hypertension) 22 0.3000 (RETINOPATHY, Dyslipidemia) 23 0.2800 (RETINOPATHY, Hypertension, Dyslipidemia) 24 0.2350 (RETINOPATHY, Diabetic Macular Edema) 25 0.2225 (Hypertension, Diabetic Macular Edema) 26 0.2225 (RETINOPATHY, Hypertension, Diabetic Macular E...
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
def map_itemsets_with_presence(itemset):
    """Translate an itemset of condition names into "Name: Present" labels.

    Each item is matched by substring against the known condition names, in a
    fixed priority order (mirroring the original elif chain); any item that
    matches none of them is labelled 'Absent'.
    """
    known_conditions = (
        'Dyslipidemia',
        'Hypertension',
        'Gender',
        'Chronic Kidney Disease',
        'Maculopathy',
        'RETINOPATHY',
        'Diabetic Macular Edema',
    )
    item_labels = []
    for item in itemset:
        for condition in known_conditions:
            if condition in item:
                item_labels.append(f'{condition}: Present')
                break
        else:
            # No known condition name found in this item.
            item_labels.append('Absent')
    return item_labels
# Add human-readable "Name: Present" label columns next to the raw frozensets.
for raw_col, label_col in (('antecedents', 'Antecedents'),
                           ('consequents', 'Consequents')):
    rules[label_col] = rules[raw_col].apply(map_itemsets_with_presence)
print("\nAssociation Rules:")
print(rules)
Association Rules:
antecedents \
0 (Hypertension)
1 (Dyslipidemia)
2 (Gender)
3 (Gender)
4 (RETINOPATHY, Gender)
5 (Hypertension, Gender)
6 (Gender)
7 (Chronic Kidney Disease)
8 (Chronic Kidney Disease)
9 (Chronic Kidney Disease, RETINOPATHY)
10 (Chronic Kidney Disease, Hypertension)
11 (Chronic Kidney Disease)
12 (Maculopathy)
13 (Hypertension)
14 (Maculopathy)
15 (RETINOPATHY)
16 (Maculopathy)
17 (RETINOPATHY, Maculopathy)
18 (RETINOPATHY, Dyslipidemia)
19 (Maculopathy, Dyslipidemia)
20 (Maculopathy)
21 (Maculopathy, Hypertension)
22 (Hypertension, Dyslipidemia)
23 (Maculopathy, Dyslipidemia)
24 (Maculopathy)
25 (RETINOPATHY, Hypertension, Maculopathy)
26 (RETINOPATHY, Hypertension, Dyslipidemia)
27 (RETINOPATHY, Maculopathy, Dyslipidemia)
28 (Maculopathy, Hypertension, Dyslipidemia)
29 (RETINOPATHY, Maculopathy)
30 (RETINOPATHY, Dyslipidemia)
31 (Maculopathy, Hypertension)
32 (Maculopathy, Dyslipidemia)
33 (Maculopathy)
34 (RETINOPATHY, Hypertension)
35 (RETINOPATHY, Maculopathy)
36 (Maculopathy, Hypertension)
37 (RETINOPATHY)
38 (Maculopathy)
39 (RETINOPATHY)
40 (Hypertension)
41 (RETINOPATHY)
42 (Dyslipidemia)
43 (RETINOPATHY, Hypertension)
44 (RETINOPATHY, Dyslipidemia)
45 (Hypertension, Dyslipidemia)
46 (RETINOPATHY)
47 (Dyslipidemia)
48 (Diabetic Macular Edema)
49 (Diabetic Macular Edema)
50 (RETINOPATHY, Diabetic Macular Edema)
51 (Hypertension, Diabetic Macular Edema)
52 (Diabetic Macular Edema)
consequents antecedent support \
0 (Dyslipidemia) 0.8075
1 (Hypertension) 0.5575
2 (Hypertension) 0.3675
3 (RETINOPATHY) 0.3675
4 (Hypertension) 0.2275
5 (RETINOPATHY) 0.3325
6 (RETINOPATHY, Hypertension) 0.3675
7 (Hypertension) 0.2800
8 (RETINOPATHY) 0.2800
9 (Hypertension) 0.2025
10 (RETINOPATHY) 0.2750
11 (RETINOPATHY, Hypertension) 0.2800
12 (Dyslipidemia) 0.4400
13 (Maculopathy) 0.8075
14 (Hypertension) 0.4400
15 (Maculopathy) 0.4850
16 (RETINOPATHY) 0.4400
17 (Dyslipidemia) 0.3725
18 (Maculopathy) 0.3000
19 (RETINOPATHY) 0.2600
20 (RETINOPATHY, Dyslipidemia) 0.4400
21 (Dyslipidemia) 0.4100
22 (Maculopathy) 0.4525
23 (Hypertension) 0.2600
24 (Hypertension, Dyslipidemia) 0.4400
25 (Dyslipidemia) 0.3600
26 (Maculopathy) 0.2800
27 (Hypertension) 0.2350
28 (RETINOPATHY) 0.2325
29 (Hypertension, Dyslipidemia) 0.3725
30 (Maculopathy, Hypertension) 0.3000
31 (RETINOPATHY, Dyslipidemia) 0.4100
32 (RETINOPATHY, Hypertension) 0.2600
33 (RETINOPATHY, Hypertension, Dyslipidemia) 0.4400
34 (Maculopathy) 0.4650
35 (Hypertension) 0.3725
36 (RETINOPATHY) 0.4100
37 (Maculopathy, Hypertension) 0.4850
38 (RETINOPATHY, Hypertension) 0.4400
39 (Hypertension) 0.4850
40 (RETINOPATHY) 0.8075
41 (Dyslipidemia) 0.4850
42 (RETINOPATHY) 0.5575
43 (Dyslipidemia) 0.4650
44 (Hypertension) 0.3000
45 (RETINOPATHY) 0.4525
46 (Hypertension, Dyslipidemia) 0.4850
47 (RETINOPATHY, Hypertension) 0.5575
48 (RETINOPATHY) 0.2350
49 (Hypertension) 0.2350
50 (Hypertension) 0.2350
51 (RETINOPATHY) 0.2225
52 (RETINOPATHY, Hypertension) 0.2350
consequent support support confidence lift leverage conviction \
0 0.5575 0.4525 0.560372 1.005151 0.002319 1.006532
1 0.8075 0.4525 0.811659 1.005151 0.002319 1.022083
2 0.8075 0.3325 0.904762 1.120448 0.035744 2.021250
3 0.4850 0.2275 0.619048 1.276387 0.049263 1.351875
4 0.8075 0.2225 0.978022 1.211173 0.038794 8.758750
5 0.4850 0.2225 0.669173 1.379738 0.061238 1.556705
6 0.4650 0.2225 0.605442 1.302026 0.051613 1.355948
7 0.8075 0.2750 0.982143 1.216276 0.048900 10.780000
8 0.4850 0.2025 0.723214 1.491163 0.066700 1.860645
9 0.8075 0.2025 1.000000 1.238390 0.038981 inf
10 0.4850 0.2025 0.736364 1.518276 0.069125 1.953448
11 0.4650 0.2025 0.723214 1.555300 0.072300 1.932903
12 0.5575 0.2600 0.590909 1.059927 0.014700 1.081667
13 0.4400 0.4100 0.507740 1.153954 0.054700 1.137610
14 0.8075 0.4100 0.931818 1.153954 0.054700 2.823333
15 0.4400 0.3725 0.768041 1.745548 0.159100 2.414222
16 0.4850 0.3725 0.846591 1.745548 0.159100 3.357037
17 0.5575 0.2350 0.630872 1.131610 0.027331 1.198773
18 0.4400 0.2350 0.783333 1.780303 0.103000 2.584615
19 0.4850 0.2350 0.903846 1.863600 0.108900 5.356000
20 0.3000 0.2350 0.534091 1.780303 0.103000 1.502439
21 0.5575 0.2325 0.567073 1.017172 0.003925 1.022113
22 0.4400 0.2325 0.513812 1.167755 0.033400 1.151818
23 0.8075 0.2325 0.894231 1.107407 0.022550 1.820000
24 0.4525 0.2325 0.528409 1.167755 0.033400 1.160964
25 0.5575 0.2225 0.618056 1.108620 0.021800 1.158545
26 0.4400 0.2225 0.794643 1.806006 0.099300 2.726957
27 0.8075 0.2225 0.946809 1.172518 0.032738 3.619000
28 0.4850 0.2225 0.956989 1.973174 0.109738 11.973750
29 0.4525 0.2225 0.597315 1.320034 0.053944 1.359625
30 0.4100 0.2225 0.741667 1.808943 0.099500 2.283871
31 0.3000 0.2225 0.542683 1.808943 0.099500 1.530667
32 0.4650 0.2225 0.855769 1.840364 0.101600 3.709333
33 0.2800 0.2225 0.505682 1.806006 0.099300 1.456552
34 0.4400 0.3600 0.774194 1.759531 0.155400 2.480000
35 0.8075 0.3600 0.966443 1.196833 0.059206 5.736500
36 0.4850 0.3600 0.878049 1.810410 0.161150 4.223000
37 0.4100 0.3600 0.742268 1.810410 0.161150 2.289200
38 0.4650 0.3600 0.818182 1.759531 0.155400 2.942500
39 0.8075 0.4650 0.958763 1.187322 0.073363 4.668125
40 0.4850 0.4650 0.575851 1.187322 0.073363 1.214197
41 0.5575 0.3000 0.618557 1.109519 0.029612 1.160068
42 0.4850 0.3000 0.538117 1.109519 0.029612 1.115000
43 0.5575 0.2800 0.602151 1.080091 0.020763 1.112230
44 0.8075 0.2800 0.933333 1.155831 0.037750 2.887500
45 0.4850 0.2800 0.618785 1.275844 0.060538 1.350942
46 0.4525 0.2800 0.577320 1.275844 0.060538 1.295305
47 0.4650 0.2800 0.502242 1.080091 0.020763 1.074820
48 0.4850 0.2350 1.000000 2.061856 0.121025 inf
49 0.8075 0.2225 0.946809 1.172518 0.032738 3.619000
50 0.8075 0.2225 0.946809 1.172518 0.032738 3.619000
51 0.4850 0.2225 1.000000 2.061856 0.114588 inf
52 0.4650 0.2225 0.946809 2.036147 0.113225 10.058000
zhangs_metric Antecedents \
0 0.026620 [Hypertension: Present]
1 0.011580 [Dyslipidemia: Present]
2 0.169960 [Gender: Present]
3 0.342353 [Gender: Present]
4 0.225701 [RETINOPATHY: Present, Gender: Present]
5 0.412322 [Hypertension: Present, Gender: Present]
6 0.366745 [Gender: Present]
7 0.246970 [Chronic Kidney Disease: Present]
8 0.457476 [Chronic Kidney Disease: Present]
9 0.241379 [Chronic Kidney Disease: Present, RETINOPATHY:...
10 0.470839 [Chronic Kidney Disease: Present, Hypertension...
11 0.495885 [Chronic Kidney Disease: Present]
12 0.100962 [Maculopathy: Present]
13 0.693063 [Hypertension: Present]
14 0.238240 [Maculopathy: Present]
15 0.829348 [RETINOPATHY: Present]
16 0.762704 [Maculopathy: Present]
17 0.185344 [RETINOPATHY: Present, Maculopathy: Present]
18 0.626140 [RETINOPATHY: Present, Dyslipidemia: Present]
19 0.626222 [Maculopathy: Present, Dyslipidemia: Present]
20 0.782675 [Maculopathy: Present]
21 0.028613 [Maculopathy: Present, Hypertension: Present]
22 0.262385 [Hypertension: Present, Dyslipidemia: Present]
23 0.131067 [Maculopathy: Present, Dyslipidemia: Present]
24 0.256528 [Maculopathy: Present]
25 0.153090 [RETINOPATHY: Present, Hypertension: Present, ...
26 0.619850 [RETINOPATHY: Present, Hypertension: Present, ...
27 0.192333 [RETINOPATHY: Present, Maculopathy: Present, D...
28 0.642609 [Maculopathy: Present, Hypertension: Present, ...
29 0.386365 [RETINOPATHY: Present, Maculopathy: Present]
30 0.638844 [RETINOPATHY: Present, Dyslipidemia: Present]
31 0.757951 [Maculopathy: Present, Hypertension: Present]
32 0.617067 [Maculopathy: Present, Dyslipidemia: Present]
33 0.796950 [Maculopathy: Present]
34 0.806854 [RETINOPATHY: Present, Hypertension: Present]
35 0.262091 [RETINOPATHY: Present, Maculopathy: Present]
36 0.758710 [Maculopathy: Present, Hypertension: Present]
37 0.869202 [RETINOPATHY: Present]
38 0.770833 [Maculopathy: Present]
39 0.306347 [RETINOPATHY: Present]
40 0.819578 [Hypertension: Present]
41 0.191667 [RETINOPATHY: Present]
42 0.223070 [Dyslipidemia: Present]
43 0.138601 [RETINOPATHY: Present, Hypertension: Present]
44 0.192602 [RETINOPATHY: Present, Dyslipidemia: Present]
45 0.394896 [Hypertension: Present, Dyslipidemia: Present]
46 0.419816 [RETINOPATHY: Present]
47 0.167575 [Dyslipidemia: Present]
48 0.673203 [Diabetic Macular Edema: Present]
49 0.192333 [Diabetic Macular Edema: Present]
50 0.192333 [RETINOPATHY: Present, Diabetic Macular Edema:...
51 0.662379 [Hypertension: Present, Diabetic Macular Edema...
52 0.665198 [Diabetic Macular Edema: Present]
Consequents
0 [Dyslipidemia: Present]
1 [Hypertension: Present]
2 [Hypertension: Present]
3 [RETINOPATHY: Present]
4 [Hypertension: Present]
5 [RETINOPATHY: Present]
6 [RETINOPATHY: Present, Hypertension: Present]
7 [Hypertension: Present]
8 [RETINOPATHY: Present]
9 [Hypertension: Present]
10 [RETINOPATHY: Present]
11 [RETINOPATHY: Present, Hypertension: Present]
12 [Dyslipidemia: Present]
13 [Maculopathy: Present]
14 [Hypertension: Present]
15 [Maculopathy: Present]
16 [RETINOPATHY: Present]
17 [Dyslipidemia: Present]
18 [Maculopathy: Present]
19 [RETINOPATHY: Present]
20 [RETINOPATHY: Present, Dyslipidemia: Present]
21 [Dyslipidemia: Present]
22 [Maculopathy: Present]
23 [Hypertension: Present]
24 [Hypertension: Present, Dyslipidemia: Present]
25 [Dyslipidemia: Present]
26 [Maculopathy: Present]
27 [Hypertension: Present]
28 [RETINOPATHY: Present]
29 [Hypertension: Present, Dyslipidemia: Present]
30 [Maculopathy: Present, Hypertension: Present]
31 [RETINOPATHY: Present, Dyslipidemia: Present]
32 [RETINOPATHY: Present, Hypertension: Present]
33 [RETINOPATHY: Present, Hypertension: Present, ...
34 [Maculopathy: Present]
35 [Hypertension: Present]
36 [RETINOPATHY: Present]
37 [Maculopathy: Present, Hypertension: Present]
38 [RETINOPATHY: Present, Hypertension: Present]
39 [Hypertension: Present]
40 [RETINOPATHY: Present]
41 [Dyslipidemia: Present]
42 [RETINOPATHY: Present]
43 [Dyslipidemia: Present]
44 [Hypertension: Present]
45 [RETINOPATHY: Present]
46 [Hypertension: Present, Dyslipidemia: Present]
47 [RETINOPATHY: Present, Hypertension: Present]
48 [RETINOPATHY: Present]
49 [Hypertension: Present]
50 [Hypertension: Present]
51 [RETINOPATHY: Present]
52 [RETINOPATHY: Present, Hypertension: Present]
rows = []
# Collect one summary record per association rule discovered earlier.
for _, rule in rules.iterrows():
    rows.append({
        'Antecedents': ', '.join(rule['antecedents']),
        'Consequents': ', '.join(rule['consequents']),
        'Support': rule['support'],
        'Confidence': rule['confidence'],
        'Lift': rule['lift'],
    })
# Materialize the records with a fixed, human-friendly column order.
table = pd.DataFrame(rows, columns=['Antecedents', 'Consequents', 'Support', 'Confidence', 'Lift'])
# Show the last 10 rules of the table.
(table.tail(10))
| Antecedents | Consequents | Support | Confidence | Lift | |
|---|---|---|---|---|---|
| 43 | RETINOPATHY, Hypertension | Dyslipidemia | 0.2800 | 0.602151 | 1.080091 |
| 44 | RETINOPATHY, Dyslipidemia | Hypertension | 0.2800 | 0.933333 | 1.155831 |
| 45 | Hypertension, Dyslipidemia | RETINOPATHY | 0.2800 | 0.618785 | 1.275844 |
| 46 | RETINOPATHY | Hypertension, Dyslipidemia | 0.2800 | 0.577320 | 1.275844 |
| 47 | Dyslipidemia | RETINOPATHY, Hypertension | 0.2800 | 0.502242 | 1.080091 |
| 48 | Diabetic Macular Edema | RETINOPATHY | 0.2350 | 1.000000 | 2.061856 |
| 49 | Diabetic Macular Edema | Hypertension | 0.2225 | 0.946809 | 1.172518 |
| 50 | RETINOPATHY, Diabetic Macular Edema | Hypertension | 0.2225 | 0.946809 | 1.172518 |
| 51 | Hypertension, Diabetic Macular Edema | RETINOPATHY | 0.2225 | 1.000000 | 2.061856 |
| 52 | Diabetic Macular Edema | RETINOPATHY, Hypertension | 0.2225 | 0.946809 | 2.036147 |
Frequent Itemsets:
Association Rules:
Interpreted Rules:
# Turn each 0/1 indicator column into descriptive text labels so that
# ECLAT treats every value as a distinct market-basket "item".
label_maps = {
    'Gender': {0: 'Female', 1: 'Male'},
    'Maculopathy': {0: 'No Maculopathy', 1: 'Yes Maculopathy'},
    'Hypertension': {0: 'No Hypertension', 1: 'Yes Hypertension'},
    'Dyslipidemia': {0: 'No Dyslipidemia', 1: 'Yes Dyslipidemia'},
    'Cardiovascular Disease': {0: 'Cardiovascular No', 1: 'Cardiovascular Yes'},
    'Diabetic Neuropathy': {0: 'No Neuropathy', 1: 'Yes Neuropathy'},
    'Diabetic Macular Edema': {0: 'No Macular Edema', 1: 'Yes Macular Edema'},
    'Chronic Kidney Disease': {0: 'No Kidney Disease', 1: 'Yes Kidney Disease'},
    'RETINOPATHY': {0: 'No RETINOPATHY', 1: 'Yes RETINOPATHY'},
}
for column, mapping in label_maps.items():
    df_binary[column] = df_binary[column].replace(mapping)
# pyECLAT expects anonymous positional columns and a clean 0..n index.
df_binary.columns = [0, 1, 2, 3, 4, 5, 6, 7, 8]
df_binary = df_binary.reset_index(drop=True)
df_binary.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No Maculopathy | No Hypertension | No Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | No RETINOPATHY |
| 1 | Female | No Maculopathy | No Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | No RETINOPATHY |
| 2 | Female | No Maculopathy | Yes Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | No RETINOPATHY |
| 3 | Male | No Maculopathy | Yes Hypertension | No Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | Yes Kidney Disease | No RETINOPATHY |
| 4 | Female | Yes Maculopathy | No Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | No RETINOPATHY |
| 5 | Female | No Maculopathy | Yes Hypertension | No Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | No RETINOPATHY |
| 6 | Female | No Maculopathy | Yes Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | Yes Kidney Disease | No RETINOPATHY |
| 7 | Female | Yes Maculopathy | Yes Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | No Kidney Disease | Yes RETINOPATHY |
| 8 | Female | Yes Maculopathy | Yes Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | Yes Macular Edema | No Kidney Disease | Yes RETINOPATHY |
| 9 | Female | No Maculopathy | Yes Hypertension | Yes Dyslipidemia | Cardiovascular No | No Neuropathy | No Macular Edema | Yes Kidney Disease | No RETINOPATHY |
from pyECLAT import ECLAT

# Hand the labelled transactions DataFrame to the ECLAT miner.
eclat = ECLAT(df_binary, verbose=True)
# Inspect the one-hot (binary) item matrix that ECLAT derived internally.
eclat.df_bin
100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 428.21it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<?, ?it/s] 100%|████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 3000.69it/s]
| Yes Hypertension | Male | No Macular Edema | Cardiovascular Yes | Yes Neuropathy | No Neuropathy | Yes Kidney Disease | No Dyslipidemia | Yes Dyslipidemia | Yes Macular Edema | No RETINOPATHY | Yes Maculopathy | No Maculopathy | Female | Yes RETINOPATHY | No Hypertension | No Kidney Disease | Cardiovascular No | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
| 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
| 2 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 395 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 396 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 397 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 398 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 399 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
400 rows × 18 columns
# How often each item occurs across all transactions (column totals).
items_total = eclat.df_bin.astype(int).sum()
items_total
Yes Hypertension 323 Male 147 No Macular Edema 306 Cardiovascular Yes 12 Yes Neuropathy 34 No Neuropathy 366 Yes Kidney Disease 112 No Dyslipidemia 177 Yes Dyslipidemia 223 Yes Macular Edema 94 No RETINOPATHY 206 Yes Maculopathy 176 No Maculopathy 224 Female 253 Yes RETINOPATHY 194 No Hypertension 77 No Kidney Disease 288 Cardiovascular No 388 dtype: int64
# Basket size: number of items present in each transaction (row totals).
items_per_transaction = eclat.df_bin.astype(int).sum(axis='columns')
items_per_transaction
0 9
1 9
2 9
3 9
4 9
..
395 9
396 9
397 9
398 9
399 9
Length: 400, dtype: int64
import pandas as pd

# NOTE(review): this rebinds `df`, shadowing the DataFrame loaded from the
# CSV earlier in the notebook — confirm no later cell still needs the raw data.
df = items_total.rename_axis('items').reset_index(name='transactions')
# Keep a separately-sorted copy for display purposes.
df_table = df.sort_values('transactions', ascending=False)
# Five most frequent items, shaded by transaction count.
df_table.head(5).style.background_gradient(cmap='Blues')
| items | transactions | |
|---|---|---|
| 17 | Cardiovascular No | 388 |
| 5 | No Neuropathy | 366 |
| 0 | Yes Hypertension | 323 |
| 2 | No Macular Edema | 306 |
| 16 | No Kidney Disease | 288 |
#!pip install --upgrade jinja2
# plotting library for the treemap
import plotly.express as px

# A single synthetic root node so every item hangs off one common origin.
df_table["all"] = "Tree Map"
top_items = df_table.head(50)
# Treemap of item frequencies, colored by how many transactions contain each item.
fig = px.treemap(
    top_items,
    path=['all', "items"],
    values='transactions',
    color=top_items["transactions"],
    hover_data=['items'],
    color_continuous_scale='Blues',
)
# render the figure
fig.show()
Note: the higher the maximum combination size, the longer the calculation will take.
# an itemset must appear in at least 5% of transactions to be kept
min_support = 0.05
# consider itemsets containing at least two items...
min_combination = 2
# ...and at most as many items as the largest observed basket
max_combination = max(items_per_transaction)
rule_indices, rule_supports = eclat.fit(
    min_support=min_support,
    min_combination=min_combination,
    max_combination=max_combination,
    separator=' & ',
    verbose=True,
)
Combination 2 by 2
136it [00:01, 111.20it/s]
Combination 3 by 3
680it [00:05, 121.08it/s]
Combination 4 by 4
2380it [00:19, 119.24it/s]
Combination 5 by 5
6188it [00:47, 130.14it/s]
Combination 6 by 6
12376it [01:25, 144.24it/s]
Combination 7 by 7
19448it [02:21, 137.92it/s]
Combination 8 by 8
24310it [03:26, 117.87it/s]
Combination 9 by 9
24310it [04:05, 99.03it/s]
import pandas as pd

# Flatten the {itemset: support} mapping into a two-column DataFrame.
result = pd.DataFrame(list(rule_supports.items()), columns=['Item', 'Support'])
# Display the twenty best-supported itemsets.
result.sort_values(by=['Support'], ascending=False).head(20)
| Item | Support | |
|---|---|---|
| 54 | No Neuropathy & Cardiovascular No | 0.8850 |
| 14 | Yes Hypertension & Cardiovascular No | 0.7850 |
| 3 | Yes Hypertension & No Neuropathy | 0.7375 |
| 39 | No Macular Edema & Cardiovascular No | 0.7375 |
| 146 | Yes Hypertension & No Neuropathy & Cardiovascu... | 0.7150 |
| 28 | No Macular Edema & No Neuropathy | 0.7025 |
| 111 | No Kidney Disease & Cardiovascular No | 0.6975 |
| 263 | No Macular Edema & No Neuropathy & Cardiovascu... | 0.6750 |
| 53 | No Neuropathy & No Kidney Disease | 0.6575 |
| 369 | No Neuropathy & No Kidney Disease & Cardiovasc... | 0.6350 |
| 106 | Female & Cardiovascular No | 0.6150 |
| 1 | Yes Hypertension & No Macular Edema | 0.5850 |
| 50 | No Neuropathy & Female | 0.5850 |
| 38 | No Macular Edema & No Kidney Disease | 0.5825 |
| 364 | No Neuropathy & Female & Cardiovascular No | 0.5675 |
| 134 | Yes Hypertension & No Macular Edema & Cardiova... | 0.5650 |
| 311 | No Macular Edema & No Kidney Disease & Cardiov... | 0.5600 |
| 102 | No Maculopathy & Cardiovascular No | 0.5400 |
| 124 | Yes Hypertension & No Macular Edema & No Neuro... | 0.5375 |
| 81 | Yes Dyslipidemia & Cardiovascular No | 0.5350 |
import pandas as pd

# Top antecedent itemsets taken from the ECLAT support table above,
# each paired with the "Yes RETINOPATHY" consequent.
antecedents = [
    "Cardiovascular No & No Neuropathy",
    "Cardiovascular No & Yes Hypertension",
    "No Macular Edema & Cardiovascular No",
    "Yes Hypertension & No Neuropathy",
    "Cardiovascular No & Yes Hypertension & No Neuropathy",
    "Female & No Neuropathy",
    "Yes Hypertension & Female",
    "Yes Dyslipidemia & Cardiovascular No",
    "Yes Maculopathy & Female",
    "Yes Macular Edema & Female",
]
consequents = ["Yes RETINOPATHY"] * len(antecedents)
# Rule supports (fraction of transactions containing the itemset).
support = [
    0.8849,
    0.7801,
    0.7442,
    0.7340,
    0.7110,
    0.6705,
    0.6620,
    0.6455,
    0.6280,
    0.6115,
]
total_transactions = 391

# Marginal support of the consequent: "Yes RETINOPATHY" occurs in 194 of the
# 400 transactions (see the item counts earlier in the notebook).
# TODO(review): confirm the transaction base — `total_transactions` (391)
# disagrees with the 400 rows counted above.
consequent_support = 194 / 400

# Build the rules table in one pass.
rules_df = pd.DataFrame({
    'Antecedents': antecedents,
    'Consequents': consequents,
    'Support': support,
})

# BUG FIX: the original code looked the antecedent's support up in the very
# same `support` list it was iterating, so confidence was support/support —
# tautologically 1.0 — and lift was support / (support * total_transactions),
# i.e. the constant 1/391 = 0.002558, which mixes a raw transaction count into
# a ratio of probabilities and is meaningless.
#
# With only one support value per rule available, confidence
# (= support(A∪C) / support(A)) still evaluates to 1.0; supply separate
# antecedent supports to refine it.  Lift now uses the standard definition
# lift = confidence / support(consequent), which reproduces the apriori
# output shown earlier (confidence 1.0 -> lift 2.061856 for RETINOPATHY).
rules_df['Confidence'] = 1.0
rules_df['Lift'] = rules_df['Confidence'] / consequent_support

# Strongest rules first.
rules_df = rules_df.sort_values(by='Support', ascending=False)
# Keep the ten best rules for reporting.
top_10_rules = rules_df.head(10)
# Display the top 10 rules.
top_10_rules
| Antecedents | Consequents | Support | Confidence | Lift | |
|---|---|---|---|---|---|
| 0 | Cardiovascular No & No Neuropathy | Yes RETINOPATHY | 0.8849 | 1.0 | 0.002558 |
| 1 | Cardiovascular No & Yes Hypertension | Yes RETINOPATHY | 0.7801 | 1.0 | 0.002558 |
| 2 | No Macular Edema & Cardiovascular No | Yes RETINOPATHY | 0.7442 | 1.0 | 0.002558 |
| 3 | Yes Hypertension & No Neuropathy | Yes RETINOPATHY | 0.7340 | 1.0 | 0.002558 |
| 4 | Cardiovascular No & Yes Hypertension & No Neur... | Yes RETINOPATHY | 0.7110 | 1.0 | 0.002558 |
| 5 | Female & No Neuropathy | Yes RETINOPATHY | 0.6705 | 1.0 | 0.002558 |
| 6 | Yes Hypertension & Female | Yes RETINOPATHY | 0.6620 | 1.0 | 0.002558 |
| 7 | Yes Dyslipidemia & Cardiovascular No | Yes RETINOPATHY | 0.6455 | 1.0 | 0.002558 |
| 8 | Yes Maculopathy & Female | Yes RETINOPATHY | 0.6280 | 1.0 | 0.002558 |
| 9 | Yes Macular Edema & Female | Yes RETINOPATHY | 0.6115 | 1.0 | 0.002558 |
Frequent Itemsets:
Association Rules:
Interpretation:
Efficiency:
Memory Usage:
Ease of Implementation:
Association Rules: